In [21]:
import pandas as pd

import emoji
import re

import random
random.seed(42)  # seed so random.sample of parameter combinations is reproducible

from itertools import product

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from hdbscan import HDBSCAN

import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

#allows plotly visuals to show up in HTML
from plotly import offline
offline.init_notebook_mode(connected = True)

import plotly.io as pio
pio.renderers.default = 'notebook'

import warnings
warnings.filterwarnings('ignore', message = 'IProgress not found')  # silence tqdm/ipywidgets progress-bar warning
In [5]:
#data import
# NOTE(review): hardcoded absolute path - consider a configurable DATA_DIR /
# pathlib.Path so the notebook runs on other machines
reviews = pd.read_csv('/home/amybirdee/hobby_projects/bumble_reviews_bertopic_analysis/bumble_google_play_reviews.csv')
In [6]:
#review data - cap the rendered column width so the preview table stays readable
pd.set_option('display.max_colwidth', 30)
reviews.head()
Out[6]:
reviewId userName userImage content score thumbsUpCount reviewCreatedVersion at replyContent repliedAt
0 gp:AOqpTOGCWX1s8NNKNYQ5uMK... Javed Sahiad https://play-lh.googleuser... Bumble rocks 👍🤞 5 0 5.253.0 28/03/2022 23:33 NaN NaN
1 gp:AOqpTOFmBlHJLKFu87RU2d-... Justin Miller https://play-lh.googleuser... Just a cash grab. Congrats... 1 0 5.262.0 28/03/2022 23:23 NaN NaN
2 gp:AOqpTOHAcFeMyfgGN_6k7HR... Adam Poots https://play-lh.googleuser... Terrible, l have lost tota... 1 15 5.260.0 28/03/2022 23:10 Uh oh! We'll need some mor... 23/02/2022 12:33
3 gp:AOqpTOEinp2G2V2iYesrwB6... Jonnalyn Gonzales https://play-lh.googleuser... Easy to use 5 0 5.261.1 28/03/2022 22:28 NaN NaN
4 gp:AOqpTOG4ElAfNPCFwqm-WSR... Stetson&Spurs https://play-lh.googleuser... Just alot of gimmicks, fak... 1 0 5.262.0 28/03/2022 22:20 NaN NaN

Data Exploration and Pre-Processing

In [7]:
#check shape of dataframe - 10 columns, 105,438 rows
reviews.shape
Out[7]:
(105438, 10)
In [8]:
#check data types and per-column null counts
reviews.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105438 entries, 0 to 105437
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              105438 non-null  object
 1   userName              105437 non-null  object
 2   userImage             105438 non-null  object
 3   content               105429 non-null  object
 4   score                 105438 non-null  int64 
 5   thumbsUpCount         105438 non-null  int64 
 6   reviewCreatedVersion  88667 non-null   object
 7   at                    105438 non-null  object
 8   replyContent          61241 non-null   object
 9   repliedAt             61241 non-null   object
dtypes: int64(2), object(8)
memory usage: 8.0+ MB
In [9]:
#checking dataframe summary - transpose so each column becomes a row in the summary table
reviews.describe(include = 'all').T
Out[9]:
count unique top freq mean std min 25% 50% 75% max
reviewId 105438 105438 gp:AOqpTOEhR4C-Ep8Fznetxvz... 1 NaN NaN NaN NaN NaN NaN NaN
userName 105437 95283 A Google user 4746 NaN NaN NaN NaN NaN NaN NaN
userImage 105438 100694 https://play-lh.googleuser... 4745 NaN NaN NaN NaN NaN NaN NaN
content 105429 91047 Good 1255 NaN NaN NaN NaN NaN NaN NaN
score 105438.0 NaN NaN NaN 2.829284 1.719196 1.0 1.0 3.0 5.0 5.0
thumbsUpCount 105438.0 NaN NaN NaN 2.737533 18.089949 0.0 0.0 0.0 1.0 1275.0
reviewCreatedVersion 88667 369 1.15.0 1659 NaN NaN NaN NaN NaN NaN NaN
at 105438 102652 23/04/2019 10:40 40 NaN NaN NaN NaN NaN NaN NaN
replyContent 61241 34418 We're sorry you didn't enj... 1489 NaN NaN NaN NaN NaN NaN NaN
repliedAt 61241 46108 21/04/2017 08:42 34 NaN NaN NaN NaN NaN NaN NaN
In [10]:
# keep only the two fields the topic model needs: the review ID and the review text
review_content = reviews.loc[:, ['reviewId', 'content']]
In [11]:
# drop the rows whose review text is missing, then renumber the index from zero
has_content = review_content['content'].notna()
review_content = review_content[has_content].reset_index(drop = True)
In [12]:
#no null values in content column after the dropna above
review_content.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105429 entries, 0 to 105428
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   reviewId  105429 non-null  object
 1   content   105429 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB

Text processing - Emojis

In [13]:
#define function to remove emojis
def remove_emojis(text):
    """Return ``text`` with unicode emojis and ``:shortcode:`` tokens stripped."""
    # replace_emoji strips actual unicode emoji characters
    without_emoji = emoji.replace_emoji(text, replace = '')
    # the regex removes textual emoji shortcodes such as :smile:
    return re.sub(r":[a-zA-Z_]+:", "", without_emoji)
In [14]:
#apply function to content column (overwrites content with the cleaned text)
review_content['content'] = review_content['content'].apply(remove_emojis)
In [15]:
#increase the column size so we can read the reviews properly
#(display-only setting - does not change the underlying data)
pd.set_option('max_colwidth', 800)
In [16]:
#emojis have been removed - spot-check the first 10 cleaned reviews
review_content.head(10)
Out[16]:
reviewId content
0 gp:AOqpTOGCWX1s8NNKNYQ5uMKTqiOTl1wci0EXqB1YMF4KnryRJC4l3TL2crSvzhwIXIjA3u6_wlKbs_4md1c7Vg Bumble rocks
1 gp:AOqpTOFmBlHJLKFu87RU2d-AKSXiIzUd_utDNWBKhswdrUJKFpCNitITSHp77BP-5Wm71YcxzkkT_Pq1k9EFwQ Just a cash grab. Congrats you have 2 new likes! Here we'll let you swipe right until you find them. Oh no, you're out of likes, but don't worry we'll let you buy more for $19 a month.
2 gp:AOqpTOHAcFeMyfgGN_6k7HRCTRwYS2q1mKCA_t5YIkfN_GoU0sIzR8f7VhR6tjO6x-tXqsDv-K7wDHYcH__l_g Terrible, l have lost total faith in this app, promised to contact me but have never came back since start of Feb, shame on this app. Worst ever
3 gp:AOqpTOEinp2G2V2iYesrwB6DaGM4ygaELm2uJlc3z46TFDkAzseIUYiF0NyrHGRcbGGXxs2guTLbewoW8gqtng Easy to use
4 gp:AOqpTOG4ElAfNPCFwqm-WSRa9SddGNRvgxIosHcujGVv178-L3NEj-3pO62RN7wEXofIz_Io4lyGpxgPTgykzg Just alot of gimmicks, fake profile, and upsales. Oh and if you decide to delete your account and leave a negative review, they block your email, or phone number from re-registering in the future lol.
5 gp:AOqpTOHK9Rwo_LnDwFjeWU35bXhpq2CMQ7N-QoPb1JAA0Yr-nFK5iuO-DOVMZq2fdKeuOz5C3xdTzWRdSnMqCQ Terrible app to many fake ppl and if they are real the dont talk
6 gp:AOqpTOHhR5TSZr1NkY09bJtJ-KWUeTLtDLnVYipg2CJRcIjOwG25wVC0qh-nlIZdne3rSexpVRD4EHsBZwomHQ I can't log in and tried to sign in multiple times after multiple reinstalling, i tried to sign in via fb and mobile number but it keeps returning to the login page after phone number verification??? This has been going on for two weeks now.
7 gp:AOqpTOHIvPlNBpoyLKexzRV8hj-6dELQ2nO_3VhYsjxEEZJPQuhg2HYgSxnyrHPp3vjrGgfciRu7kXP4Y_vtUg Easy to use!
8 gp:AOqpTOEYIceMzj8-T4t_9MLIZNk6r79aUMGvHcPePFZtJ6C4JGYCuTswb0A6fRrvLI9deK0t_iA6B_ZU1nd1dA I was Facing A problem with the uploaded picture
9 gp:AOqpTOGeozzQig8QuOaSdP_fdY4eewDJIi7PiWLesaL36bI9LGjsKNi19sWRSoG4OdN4Vx0lvbCYXQ1gz9IOMA Great site with very little spammers

Topic Model

In [14]:
# send content text to a list - have to use a sample of the original data otherwise we face memory errors
docs_sample = review_content['content'].sample(n = 10000, random_state = 42).tolist()

# create a parameter grid to test different combinations of parameters
# NOTE(review): the 'embedding model' key contains a space while every other key
# uses snake_case - it is read consistently below, but renaming it to
# 'embedding_model' (here and wherever it is looked up) would be tidier
param_grid = {'embedding model': ['sentence-transformers/all-MiniLM-L6-v2', 
                                  'sentence-transformers/all-mpnet-base-v2', 
                                  'sentence-transformers/paraphrase-MiniLM-L6-v2'],
             'nr_topics': [50, 75, 100],
             'min_cluster_size': [10, 15, 20],
             'n_neighbors': [10, 15, 30],
             'n_components': [3, 5, 7],
             'min_dist': [0.0, 0.1, 0.2],
             'ngram_range': [(1, 1), (1, 2), (1, 3)],
             'min_df': [1, 0.01, 0.03],
             'umap_metric': ['euclidean', 'cosine'],
             'hdbscan_metric': ['euclidean', 'manhattan'],
             'cluster_selection_method': ['eom', 'leaf']}
In [15]:
# build the cartesian product of every grid value; the key order here must match
# the tuple-unpacking order used when the combinations are consumed
_grid_keys = ['embedding model', 'nr_topics', 'min_cluster_size', 'n_neighbors',
              'n_components', 'min_dist', 'ngram_range', 'min_df',
              'umap_metric', 'hdbscan_metric', 'cluster_selection_method']
all_combinations = list(product(*(param_grid[key] for key in _grid_keys)))

# randomly sample N combinations (full grid is far too large to evaluate)
N = 50
sampled_combinations = random.sample(all_combinations, N)
In [16]:
# run the topic model for each sampled parameter combination and save the
# coherence score of each run to a list
results = []

for i, (embedding_model, nr_topics, min_cluster_size, n_neighbors, n_components, min_dist, ngram_range,
       min_df, umap_metric, hdbscan_metric, cluster_selection_method,) in enumerate(sampled_combinations):
    print(f"\nRunning combination {i+1}/{N}:")
    print(f"embedding_model = {embedding_model}, nr_topics = {nr_topics}, min_cluster_size = {min_cluster_size},\
         n_neighbors = {n_neighbors}, n_components = {n_components}, min_dist = {min_dist}, ngram_range = \
         {ngram_range}, min_df = {min_df}, umap_metric = {umap_metric}, hdbscan_metric = {hdbscan_metric},\
         cluster_selection_method = {cluster_selection_method}")
    
    # this transforms text data into numerical vectors
    # NOTE(review): `embedding` is a redundant alias of `embedding_model` - kept as-is
    embedding = embedding_model
    
    sentence_model = SentenceTransformer(embedding)
    
    # the transformed text has too many dimensions to apply clustering so we need to define a dimensionality
    # reduction technique (random_state pins UMAP so each run is reproducible)
    umap_model = UMAP(
    n_neighbors = n_neighbors,
    n_components = n_components,
    min_dist = min_dist,
    metric = umap_metric,
    random_state = 42,)

    # after dimensionality reduction, we need to cluster the data
    hdbscan_model = HDBSCAN(
    min_cluster_size = min_cluster_size,
    metric = hdbscan_metric,
    cluster_selection_method = cluster_selection_method,
    prediction_data = True,)
    
    # tokenisation: removing stop words and getting the most common words
    vectorizer_model = CountVectorizer(
    stop_words = 'english', ngram_range = ngram_range, min_df = min_df)
    
    # BERTopic
    topic_model = BERTopic(
    embedding_model = sentence_model,
    umap_model = umap_model,
    hdbscan_model = hdbscan_model,
    vectorizer_model = vectorizer_model,
    calculate_probabilities = True,
    verbose = False,
    low_memory = True,
    top_n_words = 10,
    nr_topics = nr_topics,)
    
    topics, probabilities = topic_model.fit_transform(docs_sample)
    
    # Below we use a BERTopic based 'Coherence Cv' metric calculation. Coherence is a way to measure how well
    # documents in a cluster cohere to a particular theme. It's measured on a 0-1 scale with 0 being no coherence
    # and 1 being perfect coherence. The general benchmarks are: poor (0.0-0.3), fair (0.3-0.5), good (0.5-0.7)
    # and excellent (0.7+)
    
    # This metric basically takes the top_n_words in each topic and checks how often they occur together in the
    # same context within a cluster, and compares that to how often these words would be expected to appear
    # near one another just due to chance. The more they appear within the same context (vs just due to chance)
    # the higher the Coherence Cv score
    
    # preprocess documents using BERTopic's internal preprocessing
    # NOTE(review): _preprocess_text is a private BERTopic method - may change between versions
    cleaned_docs = topic_model._preprocess_text(docs_sample)
    
    # extract vectorizer and tokenizer from BERTopic so gensim tokenisation matches the model's
    vectorizer = topic_model.vectorizer_model
    tokenizer = vectorizer.build_tokenizer()
    
    # tokenize documents
    tokens = [tokenizer(doc) for doc in cleaned_docs]
    
    # build gensim dictionary/corpus for this run
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(t) for t in tokens]
    
    # collect the top words of every topic; the -1 subtracted from the topic count
    # skips HDBSCAN's -1 outlier bucket (assumes remaining topic ids run 0..k-1;
    # get_topic returns False for a missing id, which the truthiness check below handles)
    topic_words = []
    for topic in range(len(topic_model.get_topic_freq()) - 1):
        
        # attempt to retrieve the top words and their probabilities for each topic
        topic_terms = topic_model.get_topic(topic)
        
        # only add words that are in the dictionary
        if topic_terms:
            valid_words = [word for word, _ in topic_terms if word in dictionary.token2id]
            if valid_words:
                topic_words.append(valid_words)
            
    # evaluate coherence (processes = 1 avoids multiprocessing issues inside the notebook)
    if topic_words:
        coherence_model = CoherenceModel(
        topics = topic_words,
        texts = tokens,
        corpus = corpus,
        dictionary = dictionary,
        coherence = 'c_v',
        processes = 1,)
        
        coherence = coherence_model.get_coherence()
    else:
        # no usable topics for this combination - record NaN rather than crash
        coherence = float('nan')
        
    print(f"Coherence Score (c_v): {coherence}")
    results.append(
    {'params': {'embedding_model': embedding_model,
                'nr_topics': nr_topics, 
                'min_cluster_size': min_cluster_size,
                'n_neighbors': n_neighbors, 
                'n_components': n_components, 
                'min_dist': min_dist, 
                'ngram_range': ngram_range, 
                'min_df': min_df, 
                'umap_metric': umap_metric, 
                'hdbscan_metric': hdbscan_metric,
                'cluster_selection_method': cluster_selection_method,},
    'coherence': coherence,}) 
Running combination 1/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 15, n_components = 7, min_dist = 0.0, ngram_range =          (1, 1), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.41459138792844635

Running combination 2/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 30, n_components = 3, min_dist = 0.2, ngram_range =          (1, 2), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.5143768515973577

Running combination 3/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 30, n_components = 5, min_dist = 0.1, ngram_range =          (1, 3), min_df = 1, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5813611993081119

Running combination 4/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 10,         n_neighbors = 30, n_components = 7, min_dist = 0.2, ngram_range =          (1, 3), min_df = 0.03, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.4847996472661999

Running combination 5/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 10, n_components = 7, min_dist = 0.1, ngram_range =          (1, 2), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.5446077861906513

Running combination 6/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 20,         n_neighbors = 10, n_components = 7, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.03, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5479261149917335

Running combination 7/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 15,         n_neighbors = 15, n_components = 5, min_dist = 0.2, ngram_range =          (1, 1), min_df = 0.01, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.43415317904734896

Running combination 8/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 15,         n_neighbors = 30, n_components = 3, min_dist = 0.1, ngram_range =          (1, 1), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.43315670347819674

Running combination 9/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 10,         n_neighbors = 30, n_components = 5, min_dist = 0.1, ngram_range =          (1, 2), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5969765554941723

Running combination 10/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 15, n_components = 5, min_dist = 0.0, ngram_range =          (1, 1), min_df = 0.03, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.4528264610938935

Running combination 11/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 15,         n_neighbors = 30, n_components = 5, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.03, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.49043498980965683

Running combination 12/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 10,         n_neighbors = 30, n_components = 7, min_dist = 0.2, ngram_range =          (1, 1), min_df = 0.01, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.3864338169880382

Running combination 13/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 15, n_components = 3, min_dist = 0.1, ngram_range =          (1, 2), min_df = 1, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.44875760061863446

Running combination 15/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 15,         n_neighbors = 30, n_components = 7, min_dist = 0.0, ngram_range =          (1, 2), min_df = 0.01, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5667151221929627

Running combination 16/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 75, min_cluster_size = 20,         n_neighbors = 10, n_components = 7, min_dist = 0.0, ngram_range =          (1, 1), min_df = 1, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.44835839635988894

Running combination 17/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 15,         n_neighbors = 10, n_components = 3, min_dist = 0.1, ngram_range =          (1, 3), min_df = 0.03, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5202265747855028

Running combination 18/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 15,         n_neighbors = 10, n_components = 3, min_dist = 0.0, ngram_range =          (1, 1), min_df = 0.01, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.46187882634824556

Running combination 19/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 10, n_components = 5, min_dist = 0.1, ngram_range =          (1, 1), min_df = 0.03, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.46510533443801394

Running combination 20/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 15,         n_neighbors = 15, n_components = 3, min_dist = 0.1, ngram_range =          (1, 1), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.4134971470817613

Running combination 21/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 15,         n_neighbors = 30, n_components = 5, min_dist = 0.1, ngram_range =          (1, 3), min_df = 1, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.6604798272370227

Running combination 22/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 100, min_cluster_size = 20,         n_neighbors = 10, n_components = 3, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.03, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5311392706446116

Running combination 23/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 20,         n_neighbors = 10, n_components = 7, min_dist = 0.1, ngram_range =          (1, 3), min_df = 0.03, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5444210390588129

Running combination 24/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 30, n_components = 7, min_dist = 0.0, ngram_range =          (1, 1), min_df = 0.01, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.4467320412274948

Running combination 25/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 30, n_components = 7, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.01, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.54761783156318

Running combination 26/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 10,         n_neighbors = 30, n_components = 3, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.03, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.49904030742422817

Running combination 27/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 10,         n_neighbors = 10, n_components = 5, min_dist = 0.0, ngram_range =          (1, 3), min_df = 1, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5697862009487077

Running combination 28/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 30, n_components = 7, min_dist = 0.0, ngram_range =          (1, 2), min_df = 0.01, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5459260533655138

Running combination 29/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 20,         n_neighbors = 15, n_components = 7, min_dist = 0.2, ngram_range =          (1, 2), min_df = 1, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5847599345429617

Running combination 30/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 15, n_components = 3, min_dist = 0.1, ngram_range =          (1, 1), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.43414131565489306

Running combination 31/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 75, min_cluster_size = 20,         n_neighbors = 10, n_components = 5, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.01, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.620040161409684

Running combination 32/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 15,         n_neighbors = 15, n_components = 3, min_dist = 0.2, ngram_range =          (1, 2), min_df = 0.03, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5213019020147512

Running combination 33/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 100, min_cluster_size = 10,         n_neighbors = 10, n_components = 5, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.01, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5900213963576945

Running combination 34/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 15,         n_neighbors = 30, n_components = 5, min_dist = 0.2, ngram_range =          (1, 2), min_df = 1, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5840459062284179

Running combination 35/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 15, n_components = 3, min_dist = 0.1, ngram_range =          (1, 1), min_df = 0.01, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.4283365697528902

Running combination 36/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 10, n_components = 5, min_dist = 0.2, ngram_range =          (1, 3), min_df = 0.03, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5304661172065325

Running combination 37/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 15,         n_neighbors = 15, n_components = 7, min_dist = 0.0, ngram_range =          (1, 3), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.6370141407974721

Running combination 38/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 20,         n_neighbors = 15, n_components = 3, min_dist = 0.1, ngram_range =          (1, 1), min_df = 0.03, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.4844171236887627

Running combination 39/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 20,         n_neighbors = 15, n_components = 5, min_dist = 0.2, ngram_range =          (1, 2), min_df = 0.01, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5745518213936673

Running combination 40/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 75, min_cluster_size = 20,         n_neighbors = 10, n_components = 7, min_dist = 0.0, ngram_range =          (1, 3), min_df = 1, umap_metric = euclidean, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.6255654526479489

Running combination 41/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 50, min_cluster_size = 20,         n_neighbors = 15, n_components = 5, min_dist = 0.0, ngram_range =          (1, 3), min_df = 1, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5432535438060428

Running combination 42/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 50, min_cluster_size = 10,         n_neighbors = 15, n_components = 3, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.03, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.4723800511312015

Running combination 43/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 20,         n_neighbors = 10, n_components = 7, min_dist = 0.0, ngram_range =          (1, 2), min_df = 0.01, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5457991629483518

Running combination 44/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 15,         n_neighbors = 10, n_components = 7, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.03, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.49300997726545204

Running combination 45/50:
embedding_model = sentence-transformers/paraphrase-MiniLM-L6-v2, nr_topics = 100, min_cluster_size = 15,         n_neighbors = 30, n_components = 3, min_dist = 0.1, ngram_range =          (1, 3), min_df = 0.03, umap_metric = cosine, hdbscan_metric = euclidean,         cluster_selection_method = eom
Coherence Score (c_v): 0.5061407552212335

Running combination 46/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 50, min_cluster_size = 20,         n_neighbors = 15, n_components = 3, min_dist = 0.0, ngram_range =          (1, 2), min_df = 1, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = leaf
Coherence Score (c_v): 0.5174055288559836

Running combination 47/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 15, n_components = 5, min_dist = 0.0, ngram_range =          (1, 1), min_df = 1, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.4205060055697827

Running combination 48/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 10, n_components = 5, min_dist = 0.0, ngram_range =          (1, 2), min_df = 1, umap_metric = cosine, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.528658959711193

Running combination 49/50:
embedding_model = sentence-transformers/all-mpnet-base-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 30, n_components = 5, min_dist = 0.0, ngram_range =          (1, 3), min_df = 0.01, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.5665746807284512

Running combination 50/50:
embedding_model = sentence-transformers/all-MiniLM-L6-v2, nr_topics = 75, min_cluster_size = 10,         n_neighbors = 10, n_components = 7, min_dist = 0.1, ngram_range =          (1, 1), min_df = 1, umap_metric = euclidean, hdbscan_metric = manhattan,         cluster_selection_method = eom
Coherence Score (c_v): 0.4248026973392618
In [17]:
# sort by coherence scores and find best parameters
# A run that produced no valid topics stores coherence = float('nan'); NaN is
# unordered under comparison, which makes a plain reverse sort unpredictable.
# Map NaN to -inf in the key so failed runs sink deterministically to the bottom
# (x != x is the standard NaN test, avoiding an extra math import).
results = sorted(
    results,
    key = lambda x: float('-inf') if x['coherence'] != x['coherence'] else x['coherence'],
    reverse = True)
print('Top 3 parameter combinations:')
for result in results[:3]:
    print(result)
Top 3 parameter combinations:
{'params': {'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2', 'nr_topics': 100, 'min_cluster_size': 15, 'n_neighbors': 30, 'n_components': 5, 'min_dist': 0.1, 'ngram_range': (1, 3), 'min_df': 1, 'umap_metric': 'cosine', 'hdbscan_metric': 'manhattan', 'cluster_selection_method': 'leaf'}, 'coherence': np.float64(0.6604798272370227)}
{'params': {'embedding_model': 'sentence-transformers/paraphrase-MiniLM-L6-v2', 'nr_topics': 100, 'min_cluster_size': 15, 'n_neighbors': 15, 'n_components': 7, 'min_dist': 0.0, 'ngram_range': (1, 3), 'min_df': 1, 'umap_metric': 'euclidean', 'hdbscan_metric': 'euclidean', 'cluster_selection_method': 'leaf'}, 'coherence': np.float64(0.6370141407974721)}
{'params': {'embedding_model': 'sentence-transformers/all-mpnet-base-v2', 'nr_topics': 75, 'min_cluster_size': 20, 'n_neighbors': 10, 'n_components': 7, 'min_dist': 0.0, 'ngram_range': (1, 3), 'min_df': 1, 'umap_metric': 'euclidean', 'hdbscan_metric': 'euclidean', 'cluster_selection_method': 'eom'}, 'coherence': np.float64(0.6255654526479489)}

Results

In [17]:
# send full content text to a list (the full ~105k reviews, not the 10k tuning sample)
docs = review_content['content'].tolist()

# fit the full dataset using the best parameters found above
final_sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# random_state pins UMAP so the final fit is reproducible
final_umap_model = UMAP(n_neighbors = 30, n_components = 5, min_dist = 0.1, metric = 'cosine', 
                        random_state = 42)

# prediction_data = False since we never transform new, unseen documents with this model
final_hdbscan_model = HDBSCAN(min_cluster_size = 15, metric = 'manhattan', cluster_selection_method = 'leaf',
                             prediction_data = False)

# Tokenisation, removing stop words and getting the most common words
final_vectorizer_model = CountVectorizer(stop_words = 'english', ngram_range = (1, 3), min_df = 1)

#BERTopic - calculate_probabilities switched off here to save time/memory on the full dataset
final_topic_model = BERTopic(
    embedding_model = final_sentence_model,
    umap_model = final_umap_model,
    hdbscan_model = final_hdbscan_model,
    vectorizer_model = final_vectorizer_model,
    calculate_probabilities = False,
    verbose = True,
    low_memory = True,
    top_n_words = 10,
    nr_topics = 100
)

# probabilities are not calculated, so the second return value is discarded
final_topics, _ = final_topic_model.fit_transform(docs)
2026-01-07 15:06:27,953 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████████████████████████| 3295/3295 [31:23<00:00,  1.75it/s]
2026-01-07 15:37:54,973 - BERTopic - Embedding - Completed ✓
2026-01-07 15:37:54,976 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-07 15:48:11,591 - BERTopic - Dimensionality - Completed ✓
2026-01-07 15:48:11,609 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-07 15:48:31,162 - BERTopic - Cluster - Completed ✓
2026-01-07 15:48:31,168 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-01-07 15:48:48,811 - BERTopic - Representation - Completed ✓
2026-01-07 15:48:48,883 - BERTopic - Topic reduction - Reducing number of topics
2026-01-07 15:48:49,258 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-07 15:49:09,789 - BERTopic - Representation - Completed ✓
2026-01-07 15:49:09,871 - BERTopic - Topic reduction - Reduced number of topics from 716 to 100
In [18]:
# get info for each topic
final_topic_model.get_topic_info().sort_values(by = 'Count', ascending = False).head(20)
Out[18]:
Topic Count Name Representation Representative_Docs
0 -1 62827 -1_app_people_like_just [app, people, like, just, matches, don, bumble, match, women, time] [Just like every other dating app, Better then Tinder ......., Good app]
1 0 7458 0_app_great app_great_app great [app, great app, great, app great, app great app, dating, good app, nice app, dating app, best] [Great app, Great app, great app]
2 1 3121 1_great great_great_nice_awesome [great great, great, nice, awesome, cool, nice great, great nice, great great great, amazing, awesome great] [great, Great, Great]
3 2 2994 2_good good_good good good_good_far good [good good, good good good, good, far good, good far, good far good, far good good, far, good good far, good pretty] [Very good, Good, good]
4 3 2478 3_facebook_log_upload_photo [facebook, log, upload, photo, sign, login, let, account, picture, number] [Why I can't upload photo?, I don't have facebook, I can't sign in with my Facebook]
5 4 1592 4_bumble_using bumble_bumble offline_using bumble offline [bumble, using bumble, bumble offline, using bumble offline, offline, using, account, blocked, ve, dating] [Bumble, It's Bumble., Bumble]
6 5 1495 5_offline_open_update_connection [offline, open, update, connection, network, keeps, load, internet, network connection, wifi] [Can't get on it says it's offline, Keeps saying I'm offline? (I'm not.), always offline]
7 6 1460 6_pay_boost_likes_free [pay, boost, likes, free, premium, spotlight, money, paid, liked, paying] [You have to pay to do anything, You have to pay, You have to pay]
8 7 1152 7_love_love love_cool_good love [love, love love, cool, good love, love good, cool love, good, like love, love love love, love like] [Love it, love it, Love it]
9 8 1072 8_tinder_better tinder_better_tinder better [tinder, better tinder, better, tinder better, tinder better tinder, better tinder better, way better tinder, way better, app better tinder, like tinder] [Better than Tinder, Better than tinder, better than tinder]
10 9 1049 9_fake_profiles_fake profiles_profiles fake [fake, profiles, fake profiles, profiles fake, accounts, real, fake accounts, inactive, profile, fakes] [Too many fake profiles, Too many fake profiles, Fake profiles]
11 10 978 10_matches_match_pay_pay matches [matches, match, pay, pay matches, matches matches, single match, app, match matches, money, swipe] [No matches, No matches, Give me more matches]
12 11 916 11_easy_easy use_use_use easy [easy, easy use, use, use easy, easy use easy, use easy use, simple, fun easy, interface, fun] [Easy to use., Easy to use., Easy to use]
13 12 909 12_backtrack_swipe_swipes_right [backtrack, swipe, swipes, right, swiping, swipe right, left, screen, day, app] [Swipe right, I swipe right too much., No one swipe right for u]
14 13 893 13_cancel_subscription_delete_cancel subscription [cancel, subscription, delete, cancel subscription, delete account, account, unsubscribe, charged, cancelled, charging] [How do I cancel my subscription?, Can't find how to cancel subscription, How do I cancel my subscription?]
15 14 814 14_women_message_girls_make [women, message, girls, make, ladies, woman, like, beautiful, women message, send] [i like how its women who get to make the first move., I like that women get to make the first move :)), I like that women make the first move.]
16 15 794 15_meh_eh_meh meh_yup [meh, eh, meh meh, yup, eh meh, meh eh, ehh, ya ya, ya ya ya, yep] [Meh, Meh, Meh]
17 16 733 16_ok_ok ok_okay_ok ok ok [ok, ok ok, okay, ok ok ok, ok okay, okay ok, alright, ok alright, ok ok okay, ok okay ok] [OK ok...., it's ok ok, ok ok]
18 17 732 17_blocked_18_banned_reason [blocked, 18, banned, reason, blocked reason, got, banned reason, got banned, account, got blocked] [Blocked me for no reason!, Blocked for no reason!, BLOCKED FOR NO REASON]
19 18 725 18_sucks_bad_lame_terrible [sucks, bad, lame, terrible, sucks sucks, horrible, bad bad, worst, bad sucks, sucks terrible] [Sucks, It sucks, Sucks]
In [22]:
# visualise the inter-topic distance map - the map is quite spread out so topics are
# quite diverse. But there are also lots of overlapping topics that could be merged,
# and lots of smaller topics that don't appear in the majority of documents.
# NOTE(review): presumably returns a plotly figure (plotly offline mode is
# initialised at the top of the notebook) - confirm
fig = final_topic_model.visualize_topics()
fig.show()
In [23]:
# inspect the word scores for the ten largest topics - some look like noise and can
# be deleted. Topic 9 (fake profiles) looks interesting
bar_charts = final_topic_model.visualize_barchart(topics = list(range(10)))
bar_charts.show()
In [24]:
# retrieve the top N words for each topic (topic -1 holds the outlier documents,
# so it is skipped)
all_topics = final_topic_model.get_topics()
top_n = 10

top_words_per_topic = {}
for topic_id, word_scores in all_topics.items():
    if topic_id == -1:
        continue
    top_words_per_topic[topic_id] = [word for word, _ in word_scores[:top_n]]

# print top words for each topic
for topic_id, words in top_words_per_topic.items():
    print(f"Topic {topic_id}: {', '.join(words)}")
Topic 0: app, great app, great, app great, app great app, dating, good app, nice app, dating app, best
Topic 1: great great, great, nice, awesome, cool, nice great, great nice, great great great, amazing, awesome great
Topic 2: good good, good good good, good, far good, good far, good far good, far good good, far, good good far, good pretty
Topic 3: facebook, log, upload, photo, sign, login, let, account, picture, number
Topic 4: bumble, using bumble, bumble offline, using bumble offline, offline, using, account, blocked, ve, dating
Topic 5: offline, open, update, connection, network, keeps, load, internet, network connection, wifi
Topic 6: pay, boost, likes, free, premium, spotlight, money, paid, liked, paying
Topic 7: love, love love, cool, good love, love good, cool love, good, like love, love love love, love like
Topic 8: tinder, better tinder, better, tinder better, tinder better tinder, better tinder better, way better tinder, way better, app better tinder, like tinder
Topic 9: fake, profiles, fake profiles, profiles fake, accounts, real, fake accounts, inactive, profile, fakes
Topic 10: matches, match, pay, pay matches, matches matches, single match, app, match matches, money, swipe
Topic 11: easy, easy use, use, use easy, easy use easy, use easy use, simple, fun easy, interface, fun
Topic 12: backtrack, swipe, swipes, right, swiping, swipe right, left, screen, day, app
Topic 13: cancel, subscription, delete, cancel subscription, delete account, account, unsubscribe, charged, cancelled, charging
Topic 14: women, message, girls, make, ladies, woman, like, beautiful, women message, send
Topic 15: meh, eh, meh meh, yup, eh meh, meh eh, ehh, ya ya, ya ya ya, yep
Topic 16: ok, ok ok, okay, ok ok ok, ok okay, okay ok, alright, ok alright, ok ok okay, ok okay ok
Topic 17: blocked, 18, banned, reason, blocked reason, got, banned reason, got banned, account, got blocked
Topic 18: sucks, bad, lame, terrible, sucks sucks, horrible, bad bad, worst, bad sucks, sucks terrible
Topic 19: talks, luck, responses, response, replies, responds, activity, talk, haven, results
Topic 20: notifications, messages, notification, message, sound, match, matches, new, conversations, notification sound
Topic 21: works, works works, work, doesn work, work works, working, working works, doesn, works great, does
Topic 22: met, meet, way meet, place, place meet, meet new, new people, meet new people, people, people met
Topic 23: fun, fun fun, fun fun fun, boring, fun boring, boring fun, fun fun boring, boring fun fun, entertaining, good fun
Topic 24: gun, guns, photos, pictures, banning, post, firearms, picture, photo, shirtless
Topic 25: trash, garbage, trash trash, garbage garbage, trash app, trash garbage, garbage trash, rubbish, garbage app, garbage garbage garbage
Topic 26: gender, binary, bff, friends, non binary, men, nonbinary, genders, non, men women
Topic 27: ugly, just ugly, im, im ugly, lol, ugly app, just, ugly matches, maybe, likes
Topic 28: best, best best, superb, best best best, super best, superb best, best super, super, best superb, exceptional
Topic 29: 24, 24 hour, hour, hours, end line, 24 hours, line, limit, hit end, hit end line
Topic 30: waste, waste time, time waste, waste time waste, time waste time, time, money waste, don waste, don waste time, money waste time
Topic 31: blm, political, agree, lives matter, support, planned parenthood, parenthood, black, planned, politics
Topic 32: filters, advanced, free, free filters, advanced filters, filter, removed, free advanced filters, free advanced, removed free
Topic 33: slow, glitchy, glitches, buggy, bugs, slow slow, glitchy slow, bugs bugs, slow buggy, glitches glitches
Topic 34: worth, expensive, options, expensive expensive, limited, choices, rip, expensive worth, recommend, choice
Topic 35: experience, experience good experience, experience good, good experience, great experience, experience great experience, experience great, experience worst, experience worst experience, good experience good
Topic 36: hai, ko, ka, na, nahi, nhi, koi, bhi, ng, ako
Topic 37: gud, gr8, gg, grate, gud gud, gd, gr8 app, grt, gg gud, goid
Topic 38: site, great site, good site, site great, site great site, awesome site, site good, site good site, nice site, best site
Topic 39: distance, location, miles, away, change, set, gps, setting, search, mile
Topic 40: useless, useless useless, useless useless useless, useful, helpful, useless useful, useful useless, useless pointless, helpful useless, worthless
Topic 41: white, ethnicity, black, race, diversity, color, filter, men, black men, white people
Topic 42: love, love love, buzz, happy, gay, sexy, satisfied, hot, buzz buzz, happy love
Topic 43: coins, payment, coin, paytm, upi, bought coins, make payment, paid coins, bought, purchase coins
Topic 44: people, quality people, users, real, real people, great people, quality, people people, area, people real
Topic 45: scam, scammers, scam scam, scam scam scam, scammers scam, scams scam, money scam, men, scams, scam scammers
Topic 46: que, la, es, el, por, muy, para, las, buena, pero
Topic 47: better better, better, better better better, improvement, getting better, improvement better, good better, better good better, good better better, better getting better
Topic 48: stars, star, stars stars, 10, zero stars, stars star, zero, 10 10, gave stars, rating
Topic 49: interesting, wow, interesting interesting, neat, wow interesting, wow wow, interesting wow, interesting neat, interesting interesting interesting, neat interesting
Topic 50: bots, bots bots, bots app, bot, bots bots bots, people bots, app bots, real people bots, lots bots, bots far
Topic 51: download, install, uninstalled, installing, downloading, installed, download download, don download, won download, download install
Topic 52: service, customer, customer service, support, service customer, worst customer service, worst customer, horrible, customer support, customer service customer
Topic 53: bee, bees, beeline, knees, stung, queen, hive, honey, bee knees, queen bee
Topic 54: laid, got laid, dates, date, got, got date, laid got, laid dates, laid got laid, date dates
Topic 55: na, bakwas, sux, pwoli, supa, gaanddd, hj, nn, mmm, tip
Topic 56: aap, ap, good ap, good aap, aap good, dating aap, nice aap, ap good, aap nice, bad aap
Topic 57: connections, connect, good connections, connection, hookup, quality connections, connections great, connections connections, connections good, connecting
Topic 58: concept, great concept, good concept, concept great, concept good concept, idea, concept good, concept great concept, idea great concept, idea great
Topic 59: like like, like, like far, like don like, like far like, like different, like don, like like like, different, like like far
Topic 60: yellow, white, dark, background, dark mode, yellow background, mode, read, font, text
Topic 61: just started, started, joined, just joined, just, new, new just, started far, just started far, far
Topic 62: legit, fake fake, fake, fake legit, genuine, fake fake fake, legit fake, real, genuine fake, real fake
Topic 63: review, reviews, deleting, deleted, deleting review, review deleted, asking review, stop asking review, write review, deleting reviews
Topic 64: loved, good times, fun times, times, time good times, great time, time good, times loved, loved loved, good
Topic 65: aight, aight aight, aight aight aight, ight, ight aight, aight ight, aight aight ight, ight aight aight, aight ight aight, iight
Topic 66: pay win, grab, win, money, play, pay play, money grab, pay, grab money, money money
Topic 67: safe, using, started using, just started using, issues, problems, just started, secure, started, good safe
Topic 68: lit, solid, lit lit, lit solid, solid lit, solid solid, lit lit lit, lit lit solid, lit solid lit, lit solid solid
Topic 69: ghosting, ghost, ghosted, creeps, creepers, ghosts, creepy, ghost town, getting ghosted, people ghost
Topic 70: platform, great platform, platform meet, good platform, platform connect, people great platform, platform meet new, nice platform, platform best, platform best platform
Topic 71: good good, good nice, good good nice, good good good, nice good good, nice good, good, good nice good, nice, nice nice
Topic 72: pof, better pof, better, pof better, pof better pof, way better pof, better pof better, way better, poon, pof ok
Topic 73: algorithm, algorithm algorithm, algorithms, worst algorithm, terrible algorithm, algorithm bad, like algorithm, hoping, meaningful, writing
Topic 74: kool, hoes, hoes kool, hoes ain, ain, hoes ain loyal, bunch hoes, ain loyal, scum hoes ain, scum hoes
Topic 75: poor poor, poor poor poor, poor, belowe people, avoid costs poor, issues belowe people, intersted social, fun intersted, issues belowe, intersted social work
Topic 76: catfish, fish, fish catfish, catfishing, lots catfish, catfish lots catfish, like catfish, sharks, tuna, catfish lots
Topic 77: format, like format, format love, format like, love format, format like format, format love format, format easy, great format, new format
Topic 78: sure sure, sure, sure sure sure, don know, know don know, know don, know, know sure, sure don, sure sure don
Topic 79: smooth, quality, quality smooth, quality quality, smooth quality, smooth nice smooth, quality quality smooth, quality smooth quality, smooth smooth, smooth nice
Topic 80: hinge, better hinge, hinge better, better, way better, hinge way better, hinge way, hinge better hinge, way better hinge, hinge hinge
Topic 81: goood, goo, goood goood, goooood, gooood, goof, goog, excellent goood, experience goooood goood, dating fantastic goooooood
Topic 82: noice, noice noice, noice noice noice, itza, itza niice, noce noice, noce, niice noicee, itza niice noicee, noice noce
Topic 83: lauren, lauren smith, smith, justice lauren, justice, smith fields, lauren smith fields, fields, justice lauren smith, fields justice
Topic 84: bizz, mode, biz, bizz mode, bizz section, careful, really enjoying bizz, modes, enjoying bizz, commerce
Topic 85: classy, class, classy classy, classy class, classy classy classy, class classy, classy people, better class, class people classier, class needed
Topic 86: greedy, greedy app, greedy greedy, app greedy, greedy app greedy, greedy greedy app, app greedy app, got greedy, app greedy company, app greedy developer
Topic 87: boys dont like, dont like just, case families, case families just, families just, everybody watch, everybody watch mom, elitist real people, elitist real, watch mom kind
Topic 88: blah, blah blah, blah blah blah, blahblah blah, blahh, blahh blah, blah blahh, blah blahblah blah, blah blah blahh, blah blahblah
Topic 89: difficulty, lower difficulty, turn difficulty, difficulty settings, lower, lower difficulty settings, turn, settings, difficulty settings lower, settings lower
Topic 90: , , , , , , , , , 
Topic 91: thumbs, thumbs thumbs, thumbs thumbs thumbs, big thumbs thumbs, kudos thumbs double, thumb thumbs thumbs, thumbs big, thumb thumbs, thumbs kudos, thumbs double thumbs
Topic 92: crop, cropped, photos, crop pictures, photo, cropping, pictures, crop photos, zoom, annoying
Topic 93: und, ich, die, nicht, ein, ihr, ist, wenn, geht, aber
Topic 94: average average, average, average average average, average average best, average best, average af average, average average af, 12 average average, af average average, af average
Topic 95: bs bs, bs, bs bs bs, bs bs site, bs features nope, feel like bs, bs feel, bs feel like, bs site, bs site pretty
Topic 96: не, за, это, все, слишком, что, как, приложение, только, для
Topic 97: fan, fan fan, fan fan fan, big fan absolutely, alright fan portland, big ol, big ol fan, alright fan, absolutely fan, absolutely fan flipping
Topic 98: shall, knows till look, descent shall shall, downhill knows, downhill knows till, good went downhill, let happens let, knows till, happens let, ll shall ya
In [25]:
# put topic words in a dataframe so we can merge with the main dataframe:
# one row per topic with its top words joined into a single string
records = []
for topic, words in top_words_per_topic.items():
    records.append({'topic': topic, 'top_words': ', '.join(words)})

topic_words_df = pd.DataFrame(records)
In [26]:
# attach the fitted topic assignment to each review
review_content['topic'] = final_topics

# merge into NEW frames instead of reassigning a frame from itself: the original
# `review_content = review_content.merge(...)` / `final_df = final_df.drop(...)`
# pattern is not idempotent - re-running the cell re-merges on 'topic' (creating
# suffixed duplicate columns) and then raises on the second drop
review_topics = review_content.merge(topic_words_df, on = 'topic', how = 'left')

# inner join back to the raw reviews on reviewId, then drop the columns we don't
# need downstream ('content_y' is the duplicate review text from the right frame)
final_df = (
    reviews
    .merge(review_topics, on = 'reviewId', how = 'inner')
    .drop(columns = ['userName', 'userImage', 'thumbsUpCount', 'reviewCreatedVersion',
                     'at', 'replyContent', 'repliedAt', 'content_y'])
)

final_df.head()
Out[26]:
reviewId content_x score topic top_words
0 gp:AOqpTOGCWX1s8NNKNYQ5uMKTqiOTl1wci0EXqB1YMF4KnryRJC4l3TL2crSvzhwIXIjA3u6_wlKbs_4md1c7Vg Bumble rocks 👍🤞 5 4 bumble, using bumble, bumble offline, using bumble offline, offline, using, account, blocked, ve, dating
1 gp:AOqpTOFmBlHJLKFu87RU2d-AKSXiIzUd_utDNWBKhswdrUJKFpCNitITSHp77BP-5Wm71YcxzkkT_Pq1k9EFwQ Just a cash grab. Congrats you have 2 new likes! Here we'll let you swipe right until you find them. Oh no, you're out of likes, but don't worry we'll let you buy more for $19 a month. 1 -1 NaN
2 gp:AOqpTOHAcFeMyfgGN_6k7HRCTRwYS2q1mKCA_t5YIkfN_GoU0sIzR8f7VhR6tjO6x-tXqsDv-K7wDHYcH__l_g Terrible, l have lost total faith in this app, promised to contact me but have never came back since start of Feb, shame on this app. Worst ever 1 -1 NaN
3 gp:AOqpTOEinp2G2V2iYesrwB6DaGM4ygaELm2uJlc3z46TFDkAzseIUYiF0NyrHGRcbGGXxs2guTLbewoW8gqtng Easy to use 5 11 easy, easy use, use, use easy, easy use easy, use easy use, simple, fun easy, interface, fun
4 gp:AOqpTOG4ElAfNPCFwqm-WSRa9SddGNRvgxIosHcujGVv178-L3NEj-3pO62RN7wEXofIz_Io4lyGpxgPTgykzg Just alot of gimmicks, fake profile, and upsales. Oh and if you decide to delete your account and leave a negative review, they block your email, or phone number from re-registering in the future lol. 1 -1 NaN

Export to SQL

In [51]:
# import modules for sql export
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
In [52]:
# Load the .env file from project directory so the DB_* variables below are
# available via os.getenv.
# NOTE(review): hardcoded absolute path ties the notebook to one machine -
# consider a path relative to the project root instead. TODO confirm
load_dotenv(dotenv_path = "/home/amybirdee/hobby_projects/.env")
Out[52]:
True
In [53]:
# Fetch environment variables
DB_HOST = os.getenv("DB_HOST")
DB_PORT = os.getenv("DB_PORT")
DB_NAME = os.getenv("DB_NAME")
DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")
In [54]:
from urllib.parse import quote_plus

# Build connection URL. The user and password are percent-encoded: interpolating
# them raw means any special character in the credentials (e.g. '@', ':', '/')
# would corrupt the URL and break parsing - SQLAlchemy's docs recommend
# quote_plus for exactly this reason
db_url = (
    f"postgresql+psycopg2://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

engine = create_engine(db_url)
In [56]:
# write the final topic-labelled reviews to Postgres: replace any existing table
# and insert in multi-row batches of 5,000 rows for speed (the bare expression
# displays the number of rows written)
final_df.to_sql(
    name = 'bumble_review_topics',
    con = engine,
    if_exists = 'replace',
    index = False,
    chunksize = 5000,
    method = 'multi'
)
Out[56]:
105429
In [ ]: